Library upload

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.1.6
## ✔ ggplot2   4.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.2.0     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'janitor'
## 
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
## 
## 
## 
## Attaching package: 'scales'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
## 
## 
## Loading required package: lattice
## 
## 
## Attaching package: 'caret'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## 
## 
## Attaching package: 'MASS'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## Loading required package: Matrix
## 
## 
## Attaching package: 'Matrix'
## 
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Loaded glmnet 4.1-10
## 
## Loading required package: mvtnorm
## 
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Data upload

# Load the raw campaign data. Despite the .csv extension the file is
# tab-delimited (hence read_tsv); clean_names() converts the headers to
# snake_case (e.g. MntWines -> mnt_wines).
df <- readr::read_tsv("data/marketing_campaign.csv") %>%
  janitor::clean_names()
## Rows: 2240 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Education, Marital_Status, Dt_Customer
## dbl (26): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, MntF...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Content

AcceptedCmp1 - 1 if customer accepted the offer in the 1st campaign, 0 otherwise AcceptedCmp2 - 1 if customer accepted the offer in the 2nd campaign, 0 otherwise AcceptedCmp3 - 1 if customer accepted the offer in the 3rd campaign, 0 otherwise AcceptedCmp4 - 1 if customer accepted the offer in the 4th campaign, 0 otherwise AcceptedCmp5 - 1 if customer accepted the offer in the 5th campaign, 0 otherwise Response (target) - 1 if customer accepted the offer in the last campaign, 0 otherwise Complain - 1 if customer complained in the last 2 years DtCustomer - date of customer’s enrolment with the company Education - customer’s level of education Marital - customer’s marital status Kidhome - number of small children in customer’s household Teenhome - number of teenagers in customer’s household Income - customer’s yearly household income MntFishProducts - amount spent on fish products in the last 2 years MntMeatProducts - amount spent on meat products in the last 2 years MntFruits - amount spent on fruits products in the last 2 years MntSweetProducts - amount spent on sweet products in the last 2 years MntWines - amount spent on wine products in the last 2 years MntGoldProds - amount spent on gold products in the last 2 years NumDealsPurchases - number of purchases made with discount NumCatalogPurchases - number of purchases made using catalogue NumStorePurchases - number of purchases made directly in stores NumWebPurchases - number of purchases made through company’s web site NumWebVisitsMonth - number of visits to company’s web site in the last month Recency - number of days since the last purchase

Exploratory Data Analysis

glimpse(df)
## Rows: 2,240
## Columns: 29
## $ id                    <dbl> 5524, 2174, 4141, 6182, 5324, 7446, 965, 6177, 4…
## $ year_birth            <dbl> 1957, 1954, 1965, 1984, 1981, 1967, 1971, 1985, …
## $ education             <chr> "Graduation", "Graduation", "Graduation", "Gradu…
## $ marital_status        <chr> "Single", "Single", "Together", "Together", "Mar…
## $ income                <dbl> 58138, 46344, 71613, 26646, 58293, 62513, 55635,…
## $ kidhome               <dbl> 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, …
## $ teenhome              <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ dt_customer           <chr> "04-09-2012", "08-03-2014", "21-08-2013", "10-02…
## $ recency               <dbl> 58, 38, 26, 26, 94, 16, 34, 32, 19, 68, 11, 59, …
## $ mnt_wines             <dbl> 635, 11, 426, 11, 173, 520, 235, 76, 14, 28, 5, …
## $ mnt_fruits            <dbl> 88, 1, 49, 4, 43, 42, 65, 10, 0, 0, 5, 16, 61, 2…
## $ mnt_meat_products     <dbl> 546, 6, 127, 20, 118, 98, 164, 56, 24, 6, 6, 11,…
## $ mnt_fish_products     <dbl> 172, 2, 111, 10, 46, 0, 50, 3, 3, 1, 0, 11, 225,…
## $ mnt_sweet_products    <dbl> 88, 1, 21, 3, 27, 42, 49, 1, 3, 1, 2, 1, 112, 5,…
## $ mnt_gold_prods        <dbl> 88, 6, 42, 5, 15, 14, 27, 23, 2, 13, 1, 16, 30, …
## $ num_deals_purchases   <dbl> 3, 2, 1, 2, 5, 2, 4, 2, 1, 1, 1, 1, 1, 3, 1, 1, …
## $ num_web_purchases     <dbl> 8, 1, 8, 2, 5, 6, 7, 4, 3, 1, 1, 2, 3, 6, 1, 7, …
## $ num_catalog_purchases <dbl> 10, 1, 2, 0, 3, 4, 3, 0, 0, 0, 0, 0, 4, 1, 0, 6,…
## $ num_store_purchases   <dbl> 4, 2, 10, 4, 6, 10, 7, 4, 2, 0, 2, 3, 8, 5, 3, 1…
## $ num_web_visits_month  <dbl> 7, 5, 4, 6, 5, 6, 6, 8, 9, 20, 7, 8, 2, 6, 8, 3,…
## $ accepted_cmp3         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ accepted_cmp4         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ accepted_cmp5         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ accepted_cmp1         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ accepted_cmp2         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ complain              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ z_cost_contact        <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ z_revenue             <dbl> 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, …
## $ response              <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, …
dim(df)
## [1] 2240   29

Only the “Income” variable has missing values. Since there are just 24 of them, the affected rows are simply removed.

skimr::skim(df)
Data summary
Name df
Number of rows 2240
Number of columns 29
_______________________
Column type frequency:
character 3
numeric 26
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
education 0 1 3 10 0 5 0
marital_status 0 1 4 8 0 8 0
dt_customer 0 1 10 10 0 663 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 5592.16 3246.66 0 2828.25 5458.5 8427.75 11191 ▇▇▇▇▇
year_birth 0 1.00 1968.81 11.98 1893 1959.00 1970.0 1977.00 1996 ▁▁▂▇▅
income 24 0.99 52247.25 25173.08 1730 35303.00 51381.5 68522.00 666666 ▇▁▁▁▁
kidhome 0 1.00 0.44 0.54 0 0.00 0.0 1.00 2 ▇▁▆▁▁
teenhome 0 1.00 0.51 0.54 0 0.00 0.0 1.00 2 ▇▁▇▁▁
recency 0 1.00 49.11 28.96 0 24.00 49.0 74.00 99 ▇▇▇▇▇
mnt_wines 0 1.00 303.94 336.60 0 23.75 173.5 504.25 1493 ▇▂▂▁▁
mnt_fruits 0 1.00 26.30 39.77 0 1.00 8.0 33.00 199 ▇▁▁▁▁
mnt_meat_products 0 1.00 166.95 225.72 0 16.00 67.0 232.00 1725 ▇▁▁▁▁
mnt_fish_products 0 1.00 37.53 54.63 0 3.00 12.0 50.00 259 ▇▁▁▁▁
mnt_sweet_products 0 1.00 27.06 41.28 0 1.00 8.0 33.00 263 ▇▁▁▁▁
mnt_gold_prods 0 1.00 44.02 52.17 0 9.00 24.0 56.00 362 ▇▁▁▁▁
num_deals_purchases 0 1.00 2.33 1.93 0 1.00 2.0 3.00 15 ▇▂▁▁▁
num_web_purchases 0 1.00 4.08 2.78 0 2.00 4.0 6.00 27 ▇▃▁▁▁
num_catalog_purchases 0 1.00 2.66 2.92 0 0.00 2.0 4.00 28 ▇▂▁▁▁
num_store_purchases 0 1.00 5.79 3.25 0 3.00 5.0 8.00 13 ▂▇▂▃▂
num_web_visits_month 0 1.00 5.32 2.43 0 3.00 6.0 7.00 20 ▅▇▁▁▁
accepted_cmp3 0 1.00 0.07 0.26 0 0.00 0.0 0.00 1 ▇▁▁▁▁
accepted_cmp4 0 1.00 0.07 0.26 0 0.00 0.0 0.00 1 ▇▁▁▁▁
accepted_cmp5 0 1.00 0.07 0.26 0 0.00 0.0 0.00 1 ▇▁▁▁▁
accepted_cmp1 0 1.00 0.06 0.25 0 0.00 0.0 0.00 1 ▇▁▁▁▁
accepted_cmp2 0 1.00 0.01 0.11 0 0.00 0.0 0.00 1 ▇▁▁▁▁
complain 0 1.00 0.01 0.10 0 0.00 0.0 0.00 1 ▇▁▁▁▁
z_cost_contact 0 1.00 3.00 0.00 3 3.00 3.0 3.00 3 ▁▁▇▁▁
z_revenue 0 1.00 11.00 0.00 11 11.00 11.0 11.00 11 ▁▁▇▁▁
response 0 1.00 0.15 0.36 0 0.00 0.0 0.00 1 ▇▁▁▁▂
# Cast identifier, categorical and binary-flag columns to factors, then drop
# the 24 rows with missing income.
# NOTE(review): df_clean is rebuilt from the raw `df` again further below
# (df_clean <- df %>% dplyr::select(-z_cost_contact, -z_revenue)), which
# silently discards both this factor conversion and the drop_na(income).
# Later code relies on kidhome/teenhome being numeric, so that reset is
# load-bearing — confirm whether it is intentional.
df_clean <- df %>%
  mutate(
    across(c(
      id, education, marital_status,
      kidhome, teenhome,
      accepted_cmp1, accepted_cmp2, accepted_cmp3,
      accepted_cmp4, accepted_cmp5, response
    ), as.factor)
  )
df_clean <- df_clean %>% tidyr::drop_na(income)

sum(is.na(df_clean))
## [1] 0

The variables “z_cost_contact” and “z_revenue” are constant; since they add no information, they are eliminated from the analysis.

# Detect constant columns: count distinct values per column and keep the ones
# with exactly one unique value.
df %>%
  summarise(across(everything(), n_distinct)) %>%
  pivot_longer(everything(), names_to = "var", values_to = "n_unique") %>%
  filter(n_unique == 1)
## # A tibble: 2 × 2
##   var            n_unique
##   <chr>             <int>
## 1 z_cost_contact        1
## 2 z_revenue             1
# NOTE(review): this restarts df_clean from the raw `df`, throwing away the
# earlier factor conversion and drop_na(income). Changing `df` to `df_clean`
# here would break `children = kidhome + teenhome` below (those columns were
# made factors), so the reset is flagged rather than changed — confirm intent.
df_clean <- df %>% dplyr::select(-z_cost_contact, -z_revenue)

Two variables with almost zero variance were found and therefore eliminated.

# Near-zero-variance screening (caret): flags columns dominated by a single
# value. accepted_cmp2 and complain are flagged and removed below.
nzv <- nearZeroVar(df_clean, saveMetrics = TRUE)
nzv[nzv$zeroVar | nzv$nzv, ]
##               freqRatio percentUnique zeroVar  nzv
## accepted_cmp2  73.66667    0.08928571   FALSE TRUE
## complain      105.66667    0.08928571   FALSE TRUE
# Keep only columns that are neither zero-variance nor near-zero-variance.
df_clean <- df_clean[, !nzv$zeroVar & !nzv$nzv]

The rare “YOLO” and “Absurd” categories of the “marital_status” variable have been removed, and “Alone” has been recoded as “Single,” since these sparsely populated categories distort the analysis.

# Tidy the marital-status variable: discard the rare "YOLO"/"Absurd" records,
# fold "Alone" into "Single", and drop the factor levels that are left unused.
df_clean <- df_clean %>%
  filter(!(marital_status %in% c("YOLO", "Absurd"))) %>%
  mutate(
    marital_status = fct_recode(marital_status, "Single" = "Alone"),
    marital_status = droplevels(marital_status)
  )

count(df_clean, marital_status)
## # A tibble: 5 × 2
##   marital_status     n
##   <fct>          <int>
## 1 Single           483
## 2 Divorced         232
## 3 Married          864
## 4 Together         580
## 5 Widow             77

The variables relating to children and young people in households have been grouped together under a single variable, “Children.”

# Feature engineering in a single pass:
#   children    — total minors in the household (kids + teenagers)
#   total_spent — overall two-year spend across every product category
#   age         — years since birth relative to 2024 (reference year chosen
#                 by the author; note the enrolment dates run 2012-2014 —
#                 confirm that 2024 is the intended anchor)
# The source columns that were combined are then dropped.
df_clean <- df_clean %>%
  mutate(
    children = kidhome + teenhome,
    total_spent = mnt_wines + mnt_fruits + mnt_meat_products +
      mnt_fish_products + mnt_sweet_products + mnt_gold_prods,
    age = 2024 - year_birth
  ) %>%
  dplyr::select(-kidhome, -teenhome, -year_birth)

A logarithmic transformation was applied to the Income feature to correct for the strong positive asymmetry (right-skewness) typical of income distributions.

# Log-transform income to correct its strong right skew, then drop rows where
# income (and hence income_log) is missing.
# FIX: the original called na.omit(is.na(income_log)); na.omit() ignores that
# argument and silently dropped rows with NA in *any* column. income is the
# only column with NAs here, so an explicit drop_na(income_log) is equivalent
# and says what it means.
df_clean <- df_clean %>%
  mutate(income_log = log(income)) %>%
  dplyr::select(-income)

df_clean <- df_clean %>%
  tidyr::drop_na(income_log)
dim(df_clean)
## [1] 2212   25
# Class balance of the target: only ~15% of customers accepted the last
# campaign (response = 1), so the target is imbalanced.
df_clean %>%
  ggplot(aes(x = factor(response))) +
  geom_bar(fill = "#4E79A7", color = "white", alpha = 0.9, width = 0.6) +
  scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
  labs(
    title = "Distribution Response",
    x = "Response",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

# Histogram of log-income (60 bins). The log transform largely symmetrizes
# the raw income distribution; a stray outlier is still visible (handled in
# the next chunk).
df_clean %>%
  ggplot(aes(x = income_log)) +
  geom_histogram(bins = 60, fill = "#4682B4", color = "white", alpha = 0.8) +
  labs(
    title = "Distribution Income",
    x = "Log Income",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

# Remove one extreme record by id and re-plot the income distribution.
# NOTE(review): id 9432 is presumably the income outlier (666,666 in the raw
# summary) — confirm against the data before relying on this.
df_clean <- df_clean %>% filter(id != 9432)
df_clean %>%
  ggplot(aes(x = income_log)) +
  geom_histogram(bins = 60, fill = "#4682B4", color = "white", alpha = 0.8) +
  labs(
    title = "Distribution Log Income",
    x = "Log Income",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

# Bar chart of education levels ordered by descending frequency.
# FIX (idiom): fct_infreq() (forcats, already attached via tidyverse) replaces
# the opaque reorder(x, x, function(x) -length(x)) trick and produces the same
# most-frequent-first ordering.
df_clean %>%
  ggplot(aes(x = fct_infreq(education))) +
  geom_bar(fill = "#4682B4", color = "white", alpha = 0.8, width = 0.6) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, color = "#333333") +
  labs(
    title = "Level of Education",
    x = "Educational Qualification",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    # Counts are printed on the bars, so the y axis text is redundant.
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

total_spent is a variable created to show the total expenditure incurred by the various instances. Its marked asymmetry is due to the underlying asymmetry of the various variables that comprise it.

# Histogram of the engineered total_spent feature; its right skew mirrors the
# skew of the individual mnt_* columns it sums.
df_clean %>%
  ggplot(aes(x = total_spent)) +
  geom_histogram(bins = 40, fill = "#4682B4", color = "white", alpha = 0.8) +
  labs(
    title = "Distribution Total Spent",
    x = "Total Spent",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

# Bar chart of children per household (0-3, kids + teens combined).
df_clean %>%
  ggplot(aes(x = factor(children))) +
  geom_bar(fill = "#4682B4", color = "white", alpha = 0.8) +
  labs(
    title = "Distribution Children",
    x = "Children",
    y = "Counts"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

As can be seen from the graphs, all the “mnt_” variables show strong asymmetry.

# Build one histogram per spending ("mnt_") column programmatically and print
# them all. Axis labels use a euro prefix with European digit separators.
mnt_cols <- df_clean %>%
  dplyr::select(starts_with("mnt")) %>%
  names()

mnt_plots <- mnt_cols %>%
  map(function(var) {
    ggplot(df_clean, aes(x = .data[[var]])) +
      geom_histogram(bins = 30, fill = "#4682B4", color = "white", alpha = 0.8) +
      scale_x_continuous(
        labels = label_dollar(prefix = "€ ", big.mark = ".", decimal.mark = ",")
      ) +
      labs(
        title = paste("Distribution", var),
        x = "Amount spent",
        y = "Frequency"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(face = "bold", size = 14),
        panel.grid.minor = element_blank()
      )
  }) %>%
  set_names(mnt_cols)

walk(mnt_plots, print)

Recency is crucial for calculating the churn rate, indicating the number of days since the last purchase.

# Histogram of recency (days since last purchase), binned in 5-day steps.
# The distribution is roughly uniform over 0-99 days.
df_clean %>%
  ggplot(aes(x = recency)) +
  geom_histogram(binwidth = 5, fill = "#4682B4", color = "white", alpha = 0.8) +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  labs(
    title = "Distribution Recency",
    subtitle = "Days since the customer's last purchase",
    x = "Days since last purchase",
    y = "Number of Customers"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10))
  )

There is a strong correlation (0.64) between income and spending on wine (mnt_wines). High income is positively linked to purchases in stores (0.63) and from catalogs (0.59). There is a strong negative correlation (-0.61) between income and visits to the website. The strongest correlation (0.74) is between mnt_meat_products and num_catalog_purchases (those who buy a lot of meat tend to do so via catalog). Those who spend on fish also tend to spend on fruit (0.59), meat (0.57), and desserts (0.59).

# Correlation matrix over the numeric features. total_spent is excluded
# because it is a linear combination of the mnt_* columns; the id column
# carries no information.
df_corr <- df_clean %>%
  dplyr::select(where(is.numeric)) %>%
  dplyr::select(-total_spent)%>%
  dplyr::select(-matches("ID|id"))

corr_matrix <- cor(df_corr, use = "complete.obs")

# FIX: the original diverging palette used two nearly identical reds
# ("#B2182B" for low, "#E41A1C" for high), making negative and positive
# correlations visually indistinguishable. Use a blue-white-red scale so the
# sign of the correlation is readable.
ggcorrplot(corr_matrix,
  method = "square",
  type = "lower",
  lab = FALSE,
  colors = c("#2166AC", "white", "#B2182B"),
  title = "Correlation Matrix",
  ggtheme = theme_minimal()
) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    panel.grid = element_blank()
  )
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
##   Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Rank variable pairs by absolute correlation. The Var1 < Var2 filter keeps
# each unordered pair exactly once and drops the diagonal.
classifica_correlazioni <- as.data.frame(corr_matrix) %>%
  
  rownames_to_column(var = "Var1") %>%
  pivot_longer(cols = -Var1, names_to = "Var2", values_to = "Correlazione") %>%
  filter(Var1 < Var2) %>%
  arrange(desc(abs(Correlazione)))

print("Higher correlations")
## [1] "Higher correlations"
head(classifica_correlazioni, 10)
## # A tibble: 10 × 3
##    Var1              Var2                  Correlazione
##    <chr>             <chr>                        <dbl>
##  1 mnt_meat_products num_catalog_purchases        0.735
##  2 mnt_wines         num_store_purchases          0.640
##  3 income_log        mnt_wines                    0.638
##  4 mnt_wines         num_catalog_purchases        0.635
##  5 income_log        num_store_purchases          0.625
##  6 income_log        num_web_visits_month        -0.607
##  7 income_log        num_catalog_purchases        0.593
##  8 mnt_fish_products mnt_fruits                   0.592
##  9 mnt_fish_products mnt_sweet_products           0.586
## 10 mnt_fish_products mnt_meat_products            0.574
# Drop the enrolment-date string; it is not used in the modeling below.
df_clean <- df_clean%>%
  dplyr::select(-dt_customer)

Models for Non Gaussian Data

Mixture of Generalized Hyperbolic Distributions (MixGHD)

# Cluster customers on the six standardized spending columns with a mixture
# of generalized hyperbolic distributions (MixGHD). The seed makes the fit
# reproducible; BIC selects G = 4 components.
vars_clustering <- c("mnt_wines", "mnt_fruits", "mnt_meat_products", 
                     "mnt_fish_products", "mnt_sweet_products", "mnt_gold_prods")

X <- df_clean %>%
  dplyr::select(all_of(vars_clustering)) %>%
  scale()
set.seed(1926)
mod_mghd <- MGHD(data = X, G = 2:5, modelSel = "BIC", scale = FALSE)  # selects between 2 and 5 clusters by BIC
## The best model (BIC) for the range of  components used is  G = 4.
## The BIC for this model is -7830.217.
# Attach the MAP cluster assignment to the cleaned data.
df_clean$cluster <- factor(mod_mghd@map)

summary(mod_mghd)
## The number   components used for the model is  G = 4.
## BIC = -7830.217. AIC = -7151.774. AIC3 = -7270.774. ICL = -8215.909.
##   Cluster N. of elements
## 1       1            311
## 2       2            857
## 3       3            537
## 4       4            506
plot(mod_mghd)

# Project the (already scaled) clustering features onto the first two
# principal components and color the points by MGHD cluster, with 95%
# normal-theory ellipses per cluster.
pca_res <- prcomp(X, scale. = TRUE)

num_cluster <- length(unique(mod_mghd@map))

df_plot <- data.frame(
  PC1 = pca_res$x[, 1],
  PC2 = pca_res$x[, 2],
  Cluster = factor(mod_mghd@map) 
)

ggplot(df_plot, aes(x = PC1, y = PC2, color = Cluster, fill = Cluster)) +
  geom_point(alpha = 0.6, size = 2) +             
  stat_ellipse(geom = "polygon", alpha = 0.2, level = 0.95) + 
  scale_color_brewer(palette = "Set1") +          
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Clustering MGHD",
    subtitle = paste("Cluster:", num_cluster), 
    # Axis labels carry the share of variance explained by each component.
    x = paste0("PC1 (", round(summary(pca_res)$importance[2,1]*100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca_res)$importance[2,2]*100, 1), "%)")
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

# Profile each cluster on raw (unscaled) behavioral averages, sorted by
# average total spend. exp(mean(income_log)) back-transforms the log income,
# i.e. it reports the geometric mean of income per cluster.
profile <- df_clean %>%
  group_by(cluster) %>%
  summarise(
    Num_Consumer = n(),
    Avg_Wines = mean(mnt_wines),
    Avg_Meat = mean(mnt_meat_products),
    Avg_Gold = mean(mnt_gold_prods),
    Avg_Income = exp(mean(income_log)), 
    Avg_Age = mean(age),
    Avg_web = mean(num_web_visits_month),
    Avg_purchases = mean(num_deals_purchases),
    Avg_tot_spent=mean(total_spent)
  ) %>%
  arrange(desc(Avg_tot_spent))

profile
## # A tibble: 4 × 10
##   cluster Num_Consumer Avg_Wines Avg_Meat Avg_Gold Avg_Income Avg_Age Avg_web
##   <fct>          <int>     <dbl>    <dbl>    <dbl>      <dbl>   <dbl>   <dbl>
## 1 4                506     529.     410.      90.6     71500.    55.9    3.37
## 2 1                311     593.     322.      28.2     66510.    57.3    4.05
## 3 3                537     366.      83.6     55.1     49920.    57.7    6.24
## 4 2                857      30.0     19.8     14.8     30688.    52.4    6.36
## # ℹ 2 more variables: Avg_purchases <dbl>, Avg_tot_spent <dbl>
# Give the clusters business-readable labels, ordered by descending average
# total spend (4 > 1 > 3 > 2 per the profile table above).
# NOTE(review): this mapping is tied to this specific fit and seed — if the
# model is refit and cluster numbering changes, the labels must be revisited.
df_clean$Segmento <- factor(df_clean$cluster, 
                            levels = c("4", "1", "3", "2"), 
                            labels = c("Premium", 
                                       "High Potential", 
                                       "Value Seekers", 
                                       "Budget/Low Income"))


table(df_clean$Segmento)
## 
##           Premium    High Potential     Value Seekers Budget/Low Income 
##               506               311               537               857
# Boxplot of total spend per segment; outlier points are hidden to keep the
# boxes readable.
ggplot(df_clean, aes(x = Segmento, y = total_spent, fill = Segmento)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  
  scale_y_continuous(labels = scales::dollar_format()) +
  
  # coord_cartesian only clips the view; it does not remove data, so the
  # boxplot statistics are computed on the full data.
  coord_cartesian(ylim = c(0, 2500)) + 
  
  theme_minimal() +
  labs(title = "Type of Consumer for Total Spent",
       x = "Type of Consumer",
       y = "Total Spent") +
  theme(legend.position = "none")

# Boxplot of log-income per segment (outliers hidden).
# FIX: the title was copy-pasted from the previous (total-spend) plot; this
# one shows income, so the title now says so.
ggplot(df_clean, aes(x = Segmento, y = income_log, fill = Segmento)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  
  scale_y_continuous() +
  theme_minimal() +
  labs(title = "Type of Consumer by Income (log)",
       x = "Type of Consumer",
       y = "Income_log") +
  theme(legend.position = "none")

For 50% of the observations the uncertainty is below about 0.85% (the median), while the mean (about 7.3%) is much higher, so a small number of observations carry most of the uncertainty (a right-skewed distribution).

# Classification uncertainty per observation: 1 minus the maximum posterior
# membership probability (0 = certain, values near 1 - 1/G = ambiguous).
n_obs <- length(mod_mghd@map)
# FIX: derive the cluster count from the fitted model instead of hard-coding
# 4, so this chunk stays correct if the BIC-selected G changes on refit.
n_clusters <- length(unique(mod_mghd@map))

# NOTE(review): mod_mghd@z should already be an n_obs x n_clusters matrix of
# posterior probabilities; the matrix() call is a defensive reshape that is a
# no-op in that case — confirm against the MixGHD class documentation.
z_matrix <- matrix(mod_mghd@z, nrow = n_obs, ncol = n_clusters)

uncertainty <- 1 - apply(z_matrix, 1, max)

summary(uncertainty)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000000 0.0008156 0.0084983 0.0730895 0.0831913 0.5775939

The categories overlap and are not clearly distinct.

# Map the uncertainty onto the PCA projection: blue = confident assignments,
# red = ambiguous ones (mostly along cluster boundaries).
df_plot_unc <- data.frame(
  PC1 = pca_res$x[, 1],
  PC2 = pca_res$x[, 2],
  Cluster = factor(mod_mghd@map),
  Uncertainty = uncertainty
)

# Sort so the most uncertain (red) points are drawn last, on top.
df_plot_unc <- df_plot_unc[order(df_plot_unc$Uncertainty), ]


ggplot(df_plot_unc, aes(x = PC1, y = PC2, color = Uncertainty)) +
  geom_point(size = 2, alpha = 0.8) +
  scale_color_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(
    title = "Classification Uncertainty Map",
    color = "Incertezza"
  )

Those belonging to cluster 4 purchase most of the food basket, but are not the leading purchasers of wine. Those belonging to cluster 1 are the leading consumers of wine, and favor meat and fish, neglecting everything else. Those belonging to cluster 2 all have negative values and are the opposite class to Premium (cluster 4). Finally, those belonging to cluster 3 are below average for meat, fruit, and fish but above average for wine and gold products.

# Cluster profile on the *standardized* features: per-cluster means in
# z-score units, reshaped to one row per variable and one column per cluster.
# Positive values = above the overall average, negative = below.
df_final <- as.data.frame(X)
df_final$Cluster <- factor(mod_mghd@map)

cluster_profile <- df_final %>%
  group_by(Cluster) %>%
  summarise(across(everything(), mean)) %>%
  pivot_longer(-Cluster, names_to = "Variabile", values_to = "Media") %>%
  pivot_wider(names_from = Cluster, values_from = Media, names_prefix = "Cluster_") %>%
  mutate(across(where(is.numeric), ~ round(., 3)))

print(as.data.frame(cluster_profile))
##            Variabile Cluster_1 Cluster_2 Cluster_3 Cluster_4
## 1          mnt_wines     0.853    -0.815     0.182     0.663
## 2         mnt_fruits     0.086    -0.506    -0.486     1.321
## 3  mnt_meat_products     0.689    -0.656    -0.372     1.083
## 4  mnt_fish_products     0.508    -0.537    -0.499     1.126
## 5 mnt_sweet_products     0.108    -0.513    -0.488     1.321
## 6     mnt_gold_prods    -0.302    -0.562     0.218     0.906
# Segment-level profile on raw averages (same metrics as `profile` above plus
# fish spend), keyed by the business labels instead of cluster numbers.
# exp(mean(income_log)) is the geometric mean of income.
profile_mghd <- df_clean%>%
  group_by(Segmento)%>%
  summarise(
    Count = n(),
    Avg_Wines = mean(mnt_wines),
    Avg_Meat = mean(mnt_meat_products),
    Avg_Gold = mean(mnt_gold_prods),
    Avg_Fish = mean(mnt_fish_products),
    Avg_Income = exp(mean(income_log)), 
    Avg_Age = mean(age),
    Avg_web = mean(num_web_visits_month),
    Avg_purchases = mean(num_deals_purchases),
    Avg_tot_spent=mean(total_spent)
  ) %>%
  arrange(desc(Avg_tot_spent))

print(profile_mghd)
## # A tibble: 4 × 11
##   Segmento Count Avg_Wines Avg_Meat Avg_Gold Avg_Fish Avg_Income Avg_Age Avg_web
##   <fct>    <int>     <dbl>    <dbl>    <dbl>    <dbl>      <dbl>   <dbl>   <dbl>
## 1 Premium    506     529.     410.      90.6    99.0      71500.    55.9    3.37
## 2 High Po…   311     593.     322.      28.2    65.3      66510.    57.3    4.05
## 3 Value S…   537     366.      83.6     55.1    10.3      49920.    57.7    6.24
## 4 Budget/…   857      30.0     19.8     14.8     8.23     30688.    52.4    6.36
## # ℹ 2 more variables: Avg_purchases <dbl>, Avg_tot_spent <dbl>
# Recompute the standardized per-cluster means after discarding ambiguous
# observations (uncertainty >= 0.2), to check how stable the profiles are.
df_full <- as.data.frame(X)
df_full$Cluster <- factor(mod_mghd@map)
df_full$Uncertainty <- uncertainty

# Keep only confidently classified rows.
df_clean_cc <- df_full[df_full$Uncertainty < 0.2, ]

cat("Distribution of Cluster after filtering:\n")
## Distribution of Cluster after filtering:
print(table(df_clean_cc$Cluster))
## 
##   1   2   3   4 
## 229 788 456 431
cluster_means <- df_clean_cc %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric), mean)) %>%
  dplyr::select(-Uncertainty) %>%
  pivot_longer(-Cluster, names_to = "Variabile", values_to = "Media") %>%
  pivot_wider(names_from = Cluster, values_from = Media, names_prefix = "Cluster_") %>%
  mutate(across(where(is.numeric), ~ round(., 3)))

print(as.data.frame(cluster_means))
##            Variabile Cluster_1 Cluster_2 Cluster_3 Cluster_4
## 1          mnt_wines     0.938    -0.830     0.210     0.667
## 2         mnt_fruits     0.021    -0.530    -0.497     1.434
## 3  mnt_meat_products     0.658    -0.671    -0.378     1.098
## 4  mnt_fish_products     0.461    -0.553    -0.514     1.164
## 5 mnt_sweet_products     0.050    -0.528    -0.497     1.438
## 6     mnt_gold_prods    -0.353    -0.590     0.286     1.002

The cluster that generates the most uncertainty is cluster 1, i.e., those who purchase the most wine.

# Per-cluster uncertainty diagnostics: mean and max uncertainty, plus the
# count and share of "critical" cases above the 0.2 threshold.
check_uncertainty <- df_full %>%
  group_by(Cluster) %>%
  summarise(
    Incertezza_Media = mean(Uncertainty),
    Incertezza_Max = max(Uncertainty),
    Casi_Critici = sum(Uncertainty > 0.2),
    Totale_Casi = n()
  ) %>%
  mutate(Perc_Critici = round(Casi_Critici / Totale_Casi * 100, 1))

print(check_uncertainty)
## # A tibble: 4 × 6
##   Cluster Incertezza_Media Incertezza_Max Casi_Critici Totale_Casi Perc_Critici
##   <fct>              <dbl>          <dbl>        <int>       <int>        <dbl>
## 1 1                 0.141           0.524           82         311         26.4
## 2 2                 0.0468          0.506           69         857          8.1
## 3 3                 0.0697          0.500           81         537         15.1
## 4 4                 0.0796          0.578           75         506         14.8
# Boxplot of uncertainty per cluster with the 0.2 screening threshold marked.
ggplot(df_full, aes(x = Cluster, y = Uncertainty, fill = Cluster)) +
  geom_boxplot(alpha = 0.7) +
  # FIX: `size` for lines was deprecated in ggplot2 3.4.0 -> `linewidth`
  # (this silences the deprecation warning the original emitted).
  geom_hline(yintercept = 0.2, linetype = "dashed", color = "red", linewidth = 1) +
  labs(
    title = "Distribution of Uncertainty",
    # FIX: label typo "!-Probability" -> "1 - Probability".
    y = "Uncertainty (1 - Probability)"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Cluster 1 tends to overlap with cluster 4. Wine consumers share with premium consumers a high consumption of meat. However, in cluster 1, the rest of the products are not consumed in large quantities, so when one of these consumers buys a little more fruit (for example), they begin to resemble a premium customer.

# For the uncertain members of cluster 1 (uncertainty > 0.2), find which
# cluster holds their second-highest posterior probability — i.e. where they
# would most likely go instead.
# NOTE(review): ncol = 4 is hard-coded to the G = 4 fit above; revisit if the
# model is refit with a different G.
z_matrix <- matrix(mod_mghd@z, nrow = nrow(X), ncol = 4)

idx_incerti_c1 <- which(mod_mghd@map == 1 & (1 - apply(z_matrix, 1, max)) > 0.2)

# Index of the second-largest posterior probability per uncertain row.
second_best <- apply(z_matrix[idx_incerti_c1, ], 1, function(row) {
  order(row, decreasing = TRUE)[2]
})


table(second_best)
## second_best
##  2  3  4 
## 11 15 56

The graph confirms that there are heavy wine drinkers in both clusters. “Is it a Wine Lover who made a big purchase today? Or is it a Premium who bought less than usual?”

# Other_Expenses sums every (standardized) spending column except wine, to
# contrast "Wine Lovers" (cluster 1) against "Premium" (cluster 4).
df_comparison <- as.data.frame(X) %>%
  mutate(
    Cluster = factor(mod_mghd@map),
    Uncertainty = uncertainty,
    Other_Expenses = mnt_fruits + mnt_meat_products + mnt_fish_products + mnt_sweet_products + mnt_gold_prods
  ) %>%
  filter(Cluster %in% c("1", "4"))

# Confident points are drawn small and faint; uncertain ones (>= 0.2) get a
# distinct shape and larger size so the boundary cases stand out.
ggplot(df_comparison, aes(x = mnt_wines, y = Other_Expenses, color = Cluster)) +
  geom_point(data = subset(df_comparison, Uncertainty < 0.2), 
             alpha = 0.4, size = 1.5) +
  geom_point(data = subset(df_comparison, Uncertainty >= 0.2), 
             aes(shape = "Incerto"), size = 3, alpha = 0.8, stroke = 1.2) +
  
  scale_color_manual(values = c("1" = "red", "4" = "blue")) +
  labs(
    title = "Wine Lovers (1) vs Premium (4)",
    x = "Wine (std)",
    y = "Other Expenses (std)",
    shape = "Stato"
  ) +
  theme_minimal()

# The larger points are the ones the clustering confuses between the two
# segments.

Teigen

The clusters have free shape, volume, and orientation. The number of degrees of freedom is the same for all clusters.

# Alternative clustering with mixtures of multivariate t-distributions
# (teigen). Gs = 2:6 lets the algorithm pick both the number of groups and
# the covariance/df model; it selects UUUC with G = 6 (unconstrained shape,
# volume and orientation; degrees of freedom constrained equal across groups).
library(teigen)

set.seed(1926)
X_mat <- as.matrix(X)

# The algorithm freely chooses the model parameterization to use.
mod_teigen <- teigen(X_mat, Gs = 2:6, verbose = FALSE)

(mod_teigen$modelname)
## [1] "UUUC"
(mod_teigen$G)
## [1] 6
summary(mod_teigen)
## ------------- Summary for teigen ------------- 
##             ------   RESULTS   ------     
##             Loglik:         -3380.51 
##             BIC:            -8054.821 
##             ICL:            -8411.959 
##             Model:          UUUC 
##             # Groups:       6 
## 
## 
## Clustering Table:   
## 
##   1   2   3   4   5   6 
## 251 262 691 423 227 357
# Hard cluster assignments: prefer the model's MAP classification when
# available, otherwise derive it from the fuzzy membership matrix.
cluster_teigen <- if (!is.null(mod_teigen$map)) {
  factor(mod_teigen$map)
} else {
  factor(apply(mod_teigen$fuzzy, 1, which.max))
}
print(length(cluster_teigen))
## [1] 2211
# Project the clustering onto the first two principal components.
pc_scores <- pca_res$x
df_plot_teigen <- data.frame(
  PC1 = pc_scores[, "PC1"],
  PC2 = pc_scores[, "PC2"],
  Cluster = cluster_teigen
)

# PCA-space view of the teigen solution with 95% ellipses per cluster.
ggplot(df_plot_teigen, aes(x = PC1, y = PC2, color = Cluster, fill = Cluster)) +
  geom_point(alpha = 0.6, size = 2) +
  stat_ellipse(geom = "polygon", alpha = 0.2, level = 0.95) +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Clustering Teigen (Modello: UUUC)",
    # FIX: the old subtitle said "T-test", but this is a mixture of
    # multivariate t distributions; UUUC = free shape/volume/orientation
    # with degrees of freedom shared across clusters.
    subtitle = "t-mixture model with shared heavy-tailed degrees of freedom",
    x = "PC1",
    y = "PC2"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

df_clean$Cluster_Teigen <- cluster_teigen

# Per-cluster profile of spending, income and demographics, ordered from
# the biggest to the smallest average spenders.
profile_teigen <- df_clean %>%
  group_by(Cluster_Teigen) %>%
  summarise(
    Count = n(),
    Avg_Wines = mean(mnt_wines),
    Avg_Meat = mean(mnt_meat_products),
    Avg_Gold = mean(mnt_gold_prods),
    Avg_Fish = mean(mnt_fish_products),
    # income was modelled on the log scale; exp(mean(log income)) is the
    # geometric mean on the original euro scale
    Avg_Income = exp(mean(income_log)),
    Avg_Age = mean(age),
    Avg_web = mean(num_web_visits_month),
    Avg_purchases = mean(num_deals_purchases),
    Avg_tot_spent = mean(total_spent)
  ) %>%
  arrange(desc(Avg_tot_spent))

print(profile_teigen)
## # A tibble: 6 × 11
##   Cluster_Teigen Count Avg_Wines Avg_Meat Avg_Gold Avg_Fish Avg_Income Avg_Age
##   <fct>          <int>     <dbl>    <dbl>    <dbl>    <dbl>      <dbl>   <dbl>
## 1 3                691     546.    412.      74.1     96.3      70963.    56.4
## 2 6                357     548.    148.      77.4     25.5      57490.    57.2
## 3 5                227     264.     37.3     26.5      2.99     46711.    58.0
## 4 1                251     120.     62.7     23.7     13.9      40605.    55.2
## 5 4                423      12.3    13.5     12.0      6.85     25966.    49.7
## 6 2                262      25.4     8.13     4.11     1.05     34609.    55.7
## # ℹ 3 more variables: Avg_web <dbl>, Avg_purchases <dbl>, Avg_tot_spent <dbl>

The model again splits the heavy wine drinkers into finer groups: the two new clusters contain consumers who buy almost exclusively wine, and consumers who buy essentially only wine and meat.

# Map teigen cluster ids to business segment names, listed from the
# highest- to the lowest-spending segment.
segment_names <- c(
  "3" = "Premium",
  "6" = "High Potential",
  "5" = "Wine Lovers only",
  "1" = "Wine and meat",
  "4" = "Value Seekers",
  "2" = "Budget/Low Income"
)

df_clean$Segmento_teg <- factor(
  df_clean$Cluster_Teigen,
  levels = names(segment_names),
  labels = unname(segment_names)
)

table(df_clean$Segmento_teg)
## 
##           Premium    High Potential  Wine Lovers only     Wine and meat 
##               691               357               227               251 
##     Value Seekers Budget/Low Income 
##               423               262
# Per-observation classification uncertainty: 1 minus the largest fuzzy
# membership probability (0 = fully confident assignment).
prob_mat <- mod_teigen$fuzzy
uncertainty_vec <- 1 - apply(prob_mat, 1, max)

df_clean$Uncertainty_teg <- uncertainty_vec
# NOTE(review): Cluster_Teigen was already assigned above; the
# re-assignment is kept so this chunk also runs on its own.
df_clean$Cluster_Teigen <- cluster_teigen

# How uncertain are the assignments within each cluster?
uncertainty_summary <- df_clean %>%
  group_by(Cluster_Teigen) %>%
  summarise(
    Count = n(),
    Min_Unc = min(Uncertainty_teg),
    Mean_Unc = mean(Uncertainty_teg),
    Median_Unc = median(Uncertainty_teg),
    Max_Unc = max(Uncertainty_teg),
    SD_Unc = sd(Uncertainty_teg)
  ) %>%
  arrange(desc(Mean_Unc))

print(uncertainty_summary)
## # A tibble: 6 × 7
##   Cluster_Teigen Count      Min_Unc Mean_Unc Median_Unc Max_Unc SD_Unc
##   <fct>          <int>        <dbl>    <dbl>      <dbl>   <dbl>  <dbl>
## 1 1                251 0.000914       0.110   0.0460      0.565 0.134 
## 2 6                357 0.000457       0.104   0.0343      0.517 0.140 
## 3 5                227 0.0000692      0.0940  0.0230      0.518 0.136 
## 4 4                423 0.000204       0.0679  0.0143      0.572 0.111 
## 5 2                262 0.0000356      0.0509  0.00458     0.522 0.0986
## 6 3                691 0.0000000433   0.0324  0.0000783   0.573 0.0924
# Highlight only the high-uncertainty points.
threshold <- 0.2
# BUG FIX: the original indexed df_clean$Uncertainty, a column that does
# not exist (the uncertainty column is Uncertainty_teg), which raised
# "Unknown or uninitialised column" and selected zero rows, so no point
# was ever highlighted.
uncertain_points <- df_plot_teigen[df_clean$Uncertainty_teg > threshold, ]
ggplot(df_plot_teigen, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(data = df_plot_teigen, color = "grey90", alpha = 0.5, size = 1) +
  geom_point(data = uncertain_points, aes(color = Cluster), size = 2, alpha = 0.8) +
  stat_ellipse(geom = "polygon", alpha = 0.1, level = 0.95) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Uncertain Cases",
    x = "PC1",
    y = "PC2"
  ) +
  theme_minimal()

With this model, there is less uncertainty in associating observations with clusters.

# Recompute per-point uncertainty from the fuzzy membership matrix and
# display it as a colour gradient on the first two principal components.
fuzzy_z <- mod_teigen$fuzzy
uncertainty <- 1 - apply(fuzzy_z, 1, max)

df_plot_unc_teigen <- data.frame(
  PC1 = pca_res$x[, 1],
  PC2 = pca_res$x[, 2],
  # max.col(..., ties.method = "first") is row-wise which.max
  Cluster = factor(max.col(fuzzy_z, ties.method = "first")),
  Uncertainty = uncertainty
)

# Sort ascending so the most uncertain (red) points are drawn on top.
df_plot_unc_teigen <- df_plot_unc_teigen[order(df_plot_unc_teigen$Uncertainty), ]

ggplot(df_plot_unc_teigen, aes(x = PC1, y = PC2, color = Uncertainty)) +
  geom_point(size = 2, alpha = 0.8) +
  stat_ellipse(aes(group = Cluster), color = "gray40", alpha = 0.3, type = "norm") +
  scale_color_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(
    title = "Uncertainty Classification",
    subtitle = "Blue = Confident classification | Red = High uncertainty (Overlap zones)",
    x = "PC1",
    y = "PC2",
    color = "Uncertainty"
  )

Box-Cox Transformation

Finally, we applied Box Cox transformations to the highly skewed variables in order to apply k-means (despite knowing that this would not yield great results, given the nature of the data). As expected, it does not discriminate the data well, identifying two large clusters.

# Cluster only the spending ("mnt_") columns.
vars_to_cluster <- grep("mnt_", names(df_clean), value = TRUE)
data_cluster <- df_clean[, vars_to_cluster]

# Box-Cox transformation of a numeric vector.
apply_boxcox <- function(x) {

  # Box-Cox requires strictly positive data: shift so the minimum is 1.
  # FIX: the original added a flat +1, which still leaves non-positive
  # values whenever min(x) < -1. For the nonnegative spending data
  # (min = 0) this shift is identical to the old +1.
  if (min(x) <= 0) {
    x <- x - min(x) + 1
  }

  # Estimate the optimal lambda (caret::BoxCoxTrans)
  bc_obj <- BoxCoxTrans(x)

  # Apply the transformation
  x_trans <- predict(bc_obj, x)
  return(x_trans)
}


# Box-Cox transform every spending column, then standardise so k-means
# distances are comparable across variables.
data_transformed <- data_cluster %>%
  mutate(across(everything(), apply_boxcox))

data_scaled <- scale(data_transformed)

# Silhouette-based choice of the number of clusters.
fviz_nbclust(data_scaled, kmeans, method = "silhouette") +
  ggtitle("Optimal K")

set.seed(1926)

# k = 2 as suggested by the silhouette criterion; 25 random restarts.
k_final <- 2
km_res <- kmeans(data_scaled, centers = k_final, nstart = 25)

df_clean$Cluster_Box <- as.factor(km_res$cluster)

# PCA visualization of the k-means partition.
fviz_cluster(
  km_res,
  data = data_scaled,
  palette = "jco",
  ggtheme = theme_minimal(),
  main = "Clustering after Box-Cox Transformation"
)

# Mean raw spending per Box-Cox/k-means cluster.
cluster_box_means <- df_clean %>%
  group_by(Cluster_Box) %>%
  summarise(across(all_of(vars_to_cluster), mean))
print(cluster_box_means)
## # A tibble: 2 × 7
##   Cluster_Box mnt_wines mnt_fruits mnt_meat_products mnt_fish_products
##   <fct>           <dbl>      <dbl>             <dbl>             <dbl>
## 1 1                94.5       3.93              29.9              5.42
## 2 2               524.       49.6              310.              70.9 
## # ℹ 2 more variables: mnt_sweet_products <dbl>, mnt_gold_prods <dbl>

Comparison between models

The model based on the generalized hyperbolic distribution tends to perform better in terms of ICL and BIC. The Box-Cox transformations were tested purely out of curiosity. Given the nature of the data, we also tried removing outliers (identified via the interquartile range), but the results did not show any significant improvement. In particular, for the model based on generalized hyperbolic distributions the evaluation metrics improved by a few points, while the teigen model lost much of its discriminating power and its uncertainty in allocating observations increased considerably.